import pandas as pd
import numpy as np
import matplotlib
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
# Load the dataset from cancer_data.csv and echo the frame so the notebook
# renders a preview of it.
df = pd.read_csv("cancer_data.csv")
df
| Gender | Symptoms | Alcohol | Hepatitis B Surface Antigen | Hepatitis B e Antigen | Hepatitis B Core Antibody | Hepatitis C Virus Antibody | Cirrhosis | Endemic Countries | Smoking | ... | Alkaline phosphatase (U/L) | Total Proteins (g/dL) | Creatinine (mg/dL) | Number of Nodules | Major dimension of nodule (cm) | Direct Bilirubin (mg/dL) | Iron (mcg/dL) | Oxygen Saturation (%) | Ferritin | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | ... | 150.0 | 7.1 | 0.70 | 1 | 3.5 | 0.50 | 86.0 | 37.0 | 435.0 | 1 |
| 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | ... | 215.0 | 8.7 | 2.95 | 1 | 1.8 | 1.96 | 86.0 | 37.0 | 435.0 | 1 |
| 2 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | ... | 109.0 | 7.0 | 2.10 | 5 | 13.0 | 0.10 | 28.0 | 6.0 | 16.0 | 1 |
| 3 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | ... | 174.0 | 8.1 | 1.11 | 2 | 15.7 | 0.20 | 86.0 | 37.0 | 435.0 | 0 |
| 4 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | ... | 109.0 | 6.9 | 1.80 | 1 | 9.0 | 1.96 | 59.0 | 15.0 | 22.0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 126 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | ... | 117.0 | 7.2 | 0.69 | 1 | 13.0 | 0.70 | 86.0 | 37.0 | 435.0 | 1 |
| 127 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 182.0 | 6.2 | 0.77 | 2 | 4.3 | 1.00 | 93.0 | 47.0 | 307.0 | 1 |
| 128 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | ... | 127.0 | 6.9 | 1.11 | 2 | 4.3 | 1.96 | 28.0 | 10.0 | 308.0 | 0 |
| 129 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 171.0 | 7.1 | 0.66 | 2 | 5.8 | 1.96 | 86.0 | 37.0 | 435.0 | 0 |
| 130 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 150.0 | 5.4 | 0.70 | 5 | 17.5 | 0.30 | 86.0 | 37.0 | 435.0 | 1 |
131 rows × 50 columns
# Print the column names with their non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 131 entries, 0 to 130 Data columns (total 50 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 131 non-null int64 1 Symptoms 131 non-null int64 2 Alcohol 131 non-null int64 3 Hepatitis B Surface Antigen 131 non-null int64 4 Hepatitis B e Antigen 131 non-null int64 5 Hepatitis B Core Antibody 131 non-null int64 6 Hepatitis C Virus Antibody 131 non-null int64 7 Cirrhosis 131 non-null int64 8 Endemic Countries 131 non-null int64 9 Smoking 131 non-null int64 10 Diabetes 131 non-null int64 11 Obesity 131 non-null int64 12 Hemochromatosis 131 non-null int64 13 Arterial Hypertension 131 non-null int64 14 Chronic Renal Insufficiency 131 non-null int64 15 Human Immunodeficiency Virus 131 non-null int64 16 Nonalcoholic Steatohepatitis 131 non-null int64 17 Esophageal Varices 131 non-null int64 18 Splenomegaly 131 non-null int64 19 Portal Hypertension 131 non-null int64 20 Portal Vein Thrombosis 131 non-null int64 21 Liver Metastasis 131 non-null int64 22 Radiological Hallmark 131 non-null int64 23 Age at diagnosis 131 non-null int64 24 Grams of Alcohol per day 131 non-null int64 25 Packs of cigarets per year: 131 non-null float64 26 Performance Status 131 non-null int64 27 Encefalopathy degree 131 non-null int64 28 Ascites degree 131 non-null int64 29 International Normalised Ratio: 131 non-null float64 30 Alpha-Fetoprotein (ng/mL) 131 non-null float64 31 Haemoglobin (g/dL) 131 non-null float64 32 Mean Corpuscular Volume (fl) 131 non-null float64 33 Leukocytes(G/L) 131 non-null float64 34 Platelets (G/L) 131 non-null float64 35 Albumin (mg/dL) 131 non-null float64 36 Total Bilirubin(mg/dL) 131 non-null float64 37 Alanine transaminase (U/L) 131 non-null float64 38 Aspartate transaminase (U/L) 131 non-null int64 39 Gamma glutamyl transferase (U/L) 131 non-null float64 40 Alkaline phosphatase (U/L) 131 non-null float64 41 Total Proteins (g/dL) 131 non-null float64 42 Creatinine (mg/dL) 131 non-null 
float64 43 Number of Nodules 131 non-null int64 44 Major dimension of nodule (cm) 131 non-null float64 45 Direct Bilirubin (mg/dL) 131 non-null float64 46 Iron (mcg/dL) 131 non-null float64 47 Oxygen Saturation (%) 131 non-null float64 48 Ferritin 131 non-null float64 49 Class 131 non-null int64 dtypes: float64(19), int64(31) memory usage: 51.3 KB
# Number of non-null entries in the Gender column.
# Index the column by name: handing the Gender values to `.loc` would look
# them up as row labels instead of counting the column, and positional `[0]`
# on the label-indexed result of `.count()` is fragile.
count_gender = df["Gender"].count()
count_gender  # count of gender data
131
# Recode the first 23 columns, which hold 0/1 codes, as text labels:
# 1 -> "male", 0 -> "female".
# NOTE(review): only "Gender" is a sex indicator; the other flag columns
# (Symptoms, Alcohol, ...) look like yes/no markers, so these labels are
# probably misleading for them -- confirm the intended encoding.
binary_columns = df.columns[0:23]  # named so the built-in `list` is not shadowed
df[binary_columns] = df[binary_columns].replace({1: "male", 0: "female"})
df.head()  # values are changed
| Gender | Symptoms | Alcohol | Hepatitis B Surface Antigen | Hepatitis B e Antigen | Hepatitis B Core Antibody | Hepatitis C Virus Antibody | Cirrhosis | Endemic Countries | Smoking | ... | Alkaline phosphatase (U/L) | Total Proteins (g/dL) | Creatinine (mg/dL) | Number of Nodules | Major dimension of nodule (cm) | Direct Bilirubin (mg/dL) | Iron (mcg/dL) | Oxygen Saturation (%) | Ferritin | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | male | female | male | female | female | female | female | male | female | male | ... | 150.0 | 7.1 | 0.70 | 1 | 3.5 | 0.50 | 86.0 | 37.0 | 435.0 | 1 |
| 1 | female | male | female | female | female | female | male | male | female | male | ... | 215.0 | 8.7 | 2.95 | 1 | 1.8 | 1.96 | 86.0 | 37.0 | 435.0 | 1 |
| 2 | male | female | male | male | female | male | female | male | female | male | ... | 109.0 | 7.0 | 2.10 | 5 | 13.0 | 0.10 | 28.0 | 6.0 | 16.0 | 1 |
| 3 | male | male | male | female | female | female | female | male | female | male | ... | 174.0 | 8.1 | 1.11 | 2 | 15.7 | 0.20 | 86.0 | 37.0 | 435.0 | 0 |
| 4 | male | male | male | male | female | male | female | male | female | male | ... | 109.0 | 6.9 | 1.80 | 1 | 9.0 | 1.96 | 59.0 | 15.0 | 22.0 | 1 |
5 rows × 50 columns
# from pandas_profiling import ProfileReport
# profile = ProfileReport(df)
# profile.to_file(output_file = "cancer_data.html" ) # creation of pandas profile report for easy understanding of the data
df.shape # (rows, columns); the recorded output below shows 131 rows and 50 columns
(131, 50)
## Report the percentage of NaN values present in each feature.
## Step 1: collect every feature that has at least one missing value
## (a feature with exactly one NaN must be reported too).
features_with_na = [
    features for features in df.columns if df[features].isnull().any()
]
## Step 2: print each feature name with its percentage of missing values.
for feature in features_with_na:
    # isnull().mean() is a fraction between 0 and 1; scale it by 100 so the
    # printed number agrees with the "%" label.
    print(feature, np.round(df[feature].isnull().mean() * 100, 4), ' % missing values')
# Numerical features are the columns whose dtype is anything but object ('O').
numerical_features = [
    name for name, dtype in df.dtypes.items() if dtype != 'O'
]
print('Number of numerical variables: ', len(numerical_features))
# Preview the first rows of the numerical columns.
df[numerical_features].head()
Number of numerical variables: 27
| Age at diagnosis | Grams of Alcohol per day | Packs of cigarets per year: | Performance Status | Encefalopathy degree | Ascites degree | International Normalised Ratio: | Alpha-Fetoprotein (ng/mL) | Haemoglobin (g/dL) | Mean Corpuscular Volume (fl) | ... | Alkaline phosphatase (U/L) | Total Proteins (g/dL) | Creatinine (mg/dL) | Number of Nodules | Major dimension of nodule (cm) | Direct Bilirubin (mg/dL) | Iron (mcg/dL) | Oxygen Saturation (%) | Ferritin | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 67 | 137 | 15.0 | 0 | 1 | 1 | 1.53 | 95.0 | 13.70 | 106.6 | ... | 150.0 | 7.1 | 0.70 | 1 | 3.5 | 0.50 | 86.0 | 37.0 | 435.0 | 1 |
| 1 | 62 | 0 | 23.0 | 0 | 1 | 1 | 1.39 | 22427.0 | 12.79 | 94.5 | ... | 215.0 | 8.7 | 2.95 | 1 | 1.8 | 1.96 | 86.0 | 37.0 | 435.0 | 1 |
| 2 | 78 | 50 | 50.0 | 2 | 1 | 2 | 0.96 | 5.8 | 8.90 | 79.8 | ... | 109.0 | 7.0 | 2.10 | 5 | 13.0 | 0.10 | 28.0 | 6.0 | 16.0 | 1 |
| 3 | 77 | 40 | 30.0 | 0 | 1 | 1 | 0.95 | 2440.0 | 13.40 | 97.1 | ... | 174.0 | 8.1 | 1.11 | 2 | 15.7 | 0.20 | 86.0 | 37.0 | 435.0 | 0 |
| 4 | 76 | 100 | 30.0 | 0 | 1 | 1 | 0.94 | 49.0 | 14.30 | 95.1 | ... | 109.0 | 6.9 | 1.80 | 1 | 9.0 | 1.96 | 59.0 | 15.0 | 22.0 | 1 |
5 rows × 27 columns
# Display the names of the numerical columns.
numerical_features
['Age at diagnosis', 'Grams of Alcohol per day', 'Packs of cigarets per year:', 'Performance Status', 'Encefalopathy degree', 'Ascites degree', 'International Normalised Ratio:', 'Alpha-Fetoprotein (ng/mL)', 'Haemoglobin (g/dL)', 'Mean Corpuscular Volume (fl)', 'Leukocytes(G/L)', 'Platelets (G/L)', 'Albumin (mg/dL)', 'Total Bilirubin(mg/dL)', 'Alanine transaminase (U/L)', 'Aspartate transaminase (U/L)', 'Gamma glutamyl transferase (U/L)', 'Alkaline phosphatase (U/L)', 'Total Proteins (g/dL)', 'Creatinine (mg/dL)', 'Number of Nodules', 'Major dimension of nodule (cm)', 'Direct Bilirubin (mg/dL)', 'Iron (mcg/dL)', 'Oxygen Saturation (%)', 'Ferritin', 'Class']
# Re-check the dtypes: the recorded output shows the first 23 columns as object.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 131 entries, 0 to 130 Data columns (total 50 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 131 non-null object 1 Symptoms 131 non-null object 2 Alcohol 131 non-null object 3 Hepatitis B Surface Antigen 131 non-null object 4 Hepatitis B e Antigen 131 non-null object 5 Hepatitis B Core Antibody 131 non-null object 6 Hepatitis C Virus Antibody 131 non-null object 7 Cirrhosis 131 non-null object 8 Endemic Countries 131 non-null object 9 Smoking 131 non-null object 10 Diabetes 131 non-null object 11 Obesity 131 non-null object 12 Hemochromatosis 131 non-null object 13 Arterial Hypertension 131 non-null object 14 Chronic Renal Insufficiency 131 non-null object 15 Human Immunodeficiency Virus 131 non-null object 16 Nonalcoholic Steatohepatitis 131 non-null object 17 Esophageal Varices 131 non-null object 18 Splenomegaly 131 non-null object 19 Portal Hypertension 131 non-null object 20 Portal Vein Thrombosis 131 non-null object 21 Liver Metastasis 131 non-null object 22 Radiological Hallmark 131 non-null object 23 Age at diagnosis 131 non-null int64 24 Grams of Alcohol per day 131 non-null int64 25 Packs of cigarets per year: 131 non-null float64 26 Performance Status 131 non-null int64 27 Encefalopathy degree 131 non-null int64 28 Ascites degree 131 non-null int64 29 International Normalised Ratio: 131 non-null float64 30 Alpha-Fetoprotein (ng/mL) 131 non-null float64 31 Haemoglobin (g/dL) 131 non-null float64 32 Mean Corpuscular Volume (fl) 131 non-null float64 33 Leukocytes(G/L) 131 non-null float64 34 Platelets (G/L) 131 non-null float64 35 Albumin (mg/dL) 131 non-null float64 36 Total Bilirubin(mg/dL) 131 non-null float64 37 Alanine transaminase (U/L) 131 non-null float64 38 Aspartate transaminase (U/L) 131 non-null int64 39 Gamma glutamyl transferase (U/L) 131 non-null float64 40 Alkaline phosphatase (U/L) 131 non-null float64 41 Total Proteins (g/dL) 131 non-null float64 42 
Creatinine (mg/dL) 131 non-null float64 43 Number of Nodules 131 non-null int64 44 Major dimension of nodule (cm) 131 non-null float64 45 Direct Bilirubin (mg/dL) 131 non-null float64 46 Iron (mcg/dL) 131 non-null float64 47 Oxygen Saturation (%) 131 non-null float64 48 Ferritin 131 non-null float64 49 Class 131 non-null int64 dtypes: float64(19), int64(8), object(23) memory usage: 51.3+ KB
# Histogram grid: one panel for each of the first 26 entries of
# numerical_features, with the bars split by Gender.
a, b = 9, 3  # grid rows, grid columns
c = 1  # 1-based position of the next subplot
fig = plt.figure(figsize=(20, 55))
for i in numerical_features[:26]:
    plt.subplot(a, b, c)
    plt.title('{}, subplot: {}{}{}'.format(i, a, b, c))
    sns.histplot(data=df, x=i, hue="Gender")
    c += 1
plt.show()
## Numerical variables are usually of two kinds: continuous and discrete.
## A numerical feature counts as discrete here when it has fewer than 25
## distinct values.
discrete_feature = [
    feature
    for feature in numerical_features
    if df[feature].unique().size < 25
]
print("Discrete Variables Count: {}".format(len(discrete_feature)))
Discrete Variables Count: 6
# Preview the first rows of the discrete columns.
df[discrete_feature].head()
| Grams of Alcohol per day | Performance Status | Encefalopathy degree | Ascites degree | Number of Nodules | Class | |
|---|---|---|---|---|---|---|
| 0 | 137 | 0 | 1 | 1 | 1 | 1 |
| 1 | 0 | 0 | 1 | 1 | 1 | 1 |
| 2 | 50 | 2 | 1 | 2 | 5 | 1 |
| 3 | 40 | 0 | 1 | 1 | 2 | 0 |
| 4 | 100 | 0 | 1 | 1 | 1 | 1 |
# Display the names of the discrete columns.
discrete_feature
['Grams of Alcohol per day', 'Performance Status', 'Encefalopathy degree', 'Ascites degree', 'Number of Nodules', 'Class']
# Box-plot grid: distribution of every discrete feature for each Gender value.
a, b = 2, 3  # grid rows, grid columns
c = 1  # 1-based position of the next subplot
fig = plt.figure(figsize=(20, 15))
for i in discrete_feature:
    plt.subplot(a, b, c)
    plt.title('{}, subplot: {}{}{}'.format(i, a, b, c))
    sns.boxplot(data=df, x="Gender", y=i)
    c += 1
plt.show()
# Every numerical feature that was not classified as discrete is treated as
# continuous.
continuous_feature = [
    name for name in numerical_features if name not in discrete_feature
]
print("Continuous feature Count {}".format(len(continuous_feature)))
Continuous feature Count 21
# Names of the continuous columns, followed by how many there are.
# NOTE: the recorded output shows only the length (21), i.e. only the last
# expression's value was displayed.
continuous_feature
len(continuous_feature)
21
# Polygon-style histogram grid: one panel per continuous feature, split by
# Gender.
a, b = 7, 3  # grid rows, grid columns
c = 1  # 1-based position of the next subplot
fig = plt.figure(figsize=(20, 55))
for i in continuous_feature:
    plt.subplot(a, b, c)
    plt.title('{} , subplot: {}{}{}'.format(i, a, b, c))
    sns.histplot(data=df, x=i, hue="Gender", element="poly")
    c += 1
plt.show()
# Box-plot grid of the continuous features by Gender. Features that contain
# a zero value are skipped and do not consume a grid slot.
# NOTE(review): nothing here transforms the data, so the reason for skipping
# zero-valued features is not visible -- confirm the filter is still wanted.
a = 7  # number of rows
b = 3  # number of columns
c = 1  # initialize plot counter
fig = plt.figure(figsize=(20, 55))
plt.style.use("ggplot")
for i in continuous_feature:
    # Test the column in place: nothing is modified, so copying the whole
    # DataFrame on every iteration is unnecessary.
    if (df[i] == 0).any():
        continue
    plt.subplot(a, b, c)
    plt.title('{}, subplot: {}{}{}'.format(i, a, b, c))
    sns.boxplot(x="Gender", y=i, data=df)
    c = c + 1
plt.show()
# Categorical features are the columns stored with object dtype ('O').
categorical_features = [
    name for name, dtype in df.dtypes.items() if dtype == 'O'
]
categorical_features
['Gender', 'Symptoms', 'Alcohol', 'Hepatitis B Surface Antigen', 'Hepatitis B e Antigen', 'Hepatitis B Core Antibody', 'Hepatitis C Virus Antibody', 'Cirrhosis', 'Endemic Countries', 'Smoking', 'Diabetes', 'Obesity', 'Hemochromatosis', 'Arterial Hypertension', 'Chronic Renal Insufficiency', 'Human Immunodeficiency Virus', 'Nonalcoholic Steatohepatitis', 'Esophageal Varices', 'Splenomegaly', 'Portal Hypertension', 'Portal Vein Thrombosis', 'Liver Metastasis', 'Radiological Hallmark']
# Preview the first rows of the categorical columns.
df[categorical_features].head()
| Gender | Symptoms | Alcohol | Hepatitis B Surface Antigen | Hepatitis B e Antigen | Hepatitis B Core Antibody | Hepatitis C Virus Antibody | Cirrhosis | Endemic Countries | Smoking | ... | Arterial Hypertension | Chronic Renal Insufficiency | Human Immunodeficiency Virus | Nonalcoholic Steatohepatitis | Esophageal Varices | Splenomegaly | Portal Hypertension | Portal Vein Thrombosis | Liver Metastasis | Radiological Hallmark | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | male | female | male | female | female | female | female | male | female | male | ... | female | female | female | female | male | female | female | female | female | male |
| 1 | female | male | female | female | female | female | male | male | female | male | ... | male | female | female | female | male | female | female | female | female | male |
| 2 | male | female | male | male | female | male | female | male | female | male | ... | male | male | female | female | female | female | male | female | male | male |
| 3 | male | male | male | female | female | female | female | male | female | male | ... | male | female | female | female | female | female | female | female | male | male |
| 4 | male | male | male | male | female | male | female | male | female | male | ... | male | male | female | female | female | female | female | female | female | male |
5 rows × 23 columns
# Report how many distinct values each categorical feature takes.
for feature in categorical_features:
    n_categories = len(df[feature].unique())
    print('The feature is {} and number of categories are {}'.format(feature, n_categories))
The feature is Gender and number of categories are 2 The feature is Symptoms and number of categories are 2 The feature is Alcohol and number of categories are 2 The feature is Hepatitis B Surface Antigen and number of categories are 2 The feature is Hepatitis B e Antigen and number of categories are 2 The feature is Hepatitis B Core Antibody and number of categories are 2 The feature is Hepatitis C Virus Antibody and number of categories are 2 The feature is Cirrhosis and number of categories are 2 The feature is Endemic Countries and number of categories are 2 The feature is Smoking and number of categories are 2 The feature is Diabetes and number of categories are 2 The feature is Obesity and number of categories are 2 The feature is Hemochromatosis and number of categories are 2 The feature is Arterial Hypertension and number of categories are 2 The feature is Chronic Renal Insufficiency and number of categories are 2 The feature is Human Immunodeficiency Virus and number of categories are 2 The feature is Nonalcoholic Steatohepatitis and number of categories are 2 The feature is Esophageal Varices and number of categories are 2 The feature is Splenomegaly and number of categories are 2 The feature is Portal Hypertension and number of categories are 2 The feature is Portal Vein Thrombosis and number of categories are 2 The feature is Liver Metastasis and number of categories are 2 The feature is Radiological Hallmark and number of categories are 2
# Number of categorical columns (sizes the pie-chart grid that follows).
len(categorical_features)
23
# Pie-chart grid: share of each value within every categorical feature.
a, b = 8, 3  # grid rows, grid columns
c = 1  # 1-based position of the next subplot
fig = plt.figure(figsize=(20, 55))
plt.style.use("ggplot")
for i in categorical_features:
    plt.subplot(a, b, c)
    plt.title('{} vs Cancer'.format(i))
    value_shares = df[i].value_counts()
    value_shares.plot(kind='pie', autopct='%.4f')
    plt.legend(loc="best")
    c += 1
plt.show()